## Warning: package 'BiocGenerics' was built under R version 4.0.5
## Warning: package 'GenomeInfoDb' was built under R version 4.0.5
load("~/Documents/MiGASti/Databases/gene_matrix.RData")
metadata <- read.table("~/Documents/MiGASti/Databases/metadata.txt")
#set rownames to Sample
setwd("~/Documents/MiGASti/Databases")
row.names(metadata) <- metadata$Sample
exclude <- read.table("samples2remove.txt")
exclude <- exclude$x
genes_counts_filt = genes_counts[, !colnames(genes_counts) %in% exclude]
#Excludes the samples from filters.
#dim(genes_tpm_filt)
metadata_filt = metadata[ !(rownames(metadata) %in% exclude), ]
length(metadata_filt)[1] 38
#remove low count genes
cpm <- cpm(genes_counts_filt)
# CPM >= 1 in at least 50% of the samples
keep.exp <- rowSums(cpm > 1) >= (0.5 * ncol(genes_counts_filt) )
genes_counts_filt1 <- genes_counts_filt[ keep.exp, ] #18625 genes
#performing voom normalisation on data
counts_voom <- limma::voom(genes_counts_filt1)
genes_counts_voom <- counts_voom$E
#order metadata and genes counts
#rownames(metadata)
#colnames(genes_counts_voom)
genes_counts_ordered <- genes_counts_voom[,rownames(metadata_filt)]
#head(genes_counts_ordered)
all(rownames(metadata_filt) == colnames (genes_counts_ordered)) #TRUE[1] TRUE
#After filtering: 496 samples in total
pca = prcomp(t(genes_counts_ordered), scale. = TRUE, center = TRUE)
autoplot(pca, data= metadata_filt, colour = 'Stimulation', shape = FALSE, label.size = 2)#After filtering: 454 samples in total
#remove ununstim samples in metadata
metadata_cultured <- metadata_filt[metadata_filt$Stimulation != "ununstim",]
#check numbers
dim(metadata_cultured)[1] 454 38
#check numbers per stimulation
table(metadata_filt$Stimulation) ATP DEX IFNy IL4 LPS R848 TNFa unstim
5 10 79 8 128 42 47 135
ununstim 42
#remove samples in genes counts datafile
genes_counts_cultured <- genes_counts_ordered[,metadata_cultured$Sample]
res.pca = prcomp(t(genes_counts_cultured))
g1 <- autoplot(res.pca, data = metadata_cultured, colour = 'Stimulation')
allResiduals <- removeBatchEffect(x = genes_counts_cultured,
design = model.matrix(~ Stimulation, data = metadata_cultured),
covariates = as.matrix(metadata_cultured[, c("picard_pct_mrna_bases", "picard_pct_ribosomal_bases", "picard_pct_pf_reads_aligned", "picard_pct_percent_duplication")]))
res.pca = prcomp(t(allResiduals))
g6 <- autoplot(res.pca, data = metadata_cultured, colour = 'Stimulation')
# dev.off()
ggarrange(g1,g6, labels = c("a", "b"))#how many PCS with > 95% variance in total?
#summary(pca)
#Proportion of variance per PC
PoV <- pca$sdev^2/sum(pca$sdev^2)
# Calculate cumulative percents for each PC
cumu <- cumsum(PoV)
# Determine which PC exhibits cumulative percent greater than 95% and % variation associated with the PC as less than 20
co1 <- which(cumu > 0.95)[1]
co1 [1] 205
#Including uncultured: 496 samples in total
#set metadata to no capitals
names(metadata_filt) = tolower(names(metadata_filt))
indx <- sapply(metadata, is.character)
metadata[indx] <- lapply(metadata[indx], function(x) as.factor(x))
#include covariates
covariates = c( "donor_id",
"region",
"stimulation",
"number",
"average_library_size",
"qubit_conc",
"diagnosis",
"main_diagnosis",
"sex",
"age",
"pmd_minutes",
"ph",
"cause_of_death_categories",
"smoking",
"alcohol_dependence_daily_use",
"autoimmune_diseases",
"infection_2weeks",
"opiates",
"benzodiazepines",
"psychopharmaca",
"year_collected",
"trimmomatic_dropped_pct",
"picard_pct_mrna_bases",
"picard_pct_ribosomal_bases",
"picard_pct_intronic_bases",
"picard_pct_intergenic_bases",
"picard_pct_pf_reads_aligned",
"picard_summed_median",
"picard_summed_mean",
"picard_pct_percent_duplication",
"star_total_reads",
"star_uniquely_mapped",
"star_uniquely_mapped_percent",
"featurecounts_assigned")
#create format matrix
matrix_rsquared = matrix(NA, nrow = length(covariates), ncol = 20) #Number of factors
matrix_pvalue = matrix(NA, nrow = length(covariates), ncol = 20)
#lineair model function
for (x in 1:length(covariates)){
for (y in 1:20){
matrix_rsquared[x,y] <- summary( lm(pca$x[,y] ~ metadata_filt[,covariates[x]]) )$adj.r.squared
matrix_pvalue[x,y] <- tidy( lm(pca$x[,y] ~ metadata_filt[,covariates[x]]) )$p.value[2] #To insert pvalues in the heatmap
}
}
#fill matrix with values
matrix_rsquared <- as.data.frame(matrix_rsquared)
matrix_pvalue <- as.data.frame(matrix_pvalue)
rownames(matrix_rsquared) <- covariates
rownames(matrix_pvalue) <- covariates
#create heatmap with rsquared values
pheatmap(matrix_rsquared, main = "Correlation (Rsquared) between variables and first 20 PCs", legend = TRUE)#Variance mainly driven by ununstim
PC3 <- pca$x[,3]
Stimulation <- metadata_filt$stimulation
df = data.frame(Stimulation, PC3)
ggplot(data = df, mapping = aes(x = Stimulation, y = PC3)) +
geom_boxplot()#Excluding uncultured: 454 samples in total
#remove ununstim samples in metadata
metadata_cultured <- metadata_filt[metadata_filt$stimulation != "ununstim",]
#check numbers
dim(metadata_cultured)[1] 454 38
#check numbers per stimulation
table(metadata_filt$stimulation) ATP DEX IFNy IL4 LPS R848 TNFa unstim
5 10 79 8 128 42 47 135
ununstim 42
#remove samples in genes counts datafile
genes_counts_cultured <- genes_counts_ordered[,metadata_cultured$sample]
#table(metadata_filt$stimulation)
#lenght(metadata_cultured)
pca = prcomp(t(genes_counts_cultured), scale. = TRUE, center = TRUE)
#set metadata to no capitals
names(metadata_cultured) = tolower(names(metadata_cultured))
indx <- sapply(metadata, is.character)
metadata[indx] <- lapply(metadata[indx], function(x) as.factor(x))
#include covariates
covariates = c( "donor_id",
"region",
"stimulation",
"number",
"average_library_size",
"qubit_conc",
"diagnosis",
"main_diagnosis",
"sex",
"age",
"pmd_minutes",
"ph",
"cause_of_death_categories",
"smoking",
"alcohol_dependence_daily_use",
"autoimmune_diseases",
"infection_2weeks",
"opiates",
"benzodiazepines",
"psychopharmaca",
"year_collected",
"trimmomatic_dropped_pct",
"picard_pct_mrna_bases",
"picard_pct_ribosomal_bases",
"picard_pct_intronic_bases",
"picard_pct_intergenic_bases",
"picard_pct_pf_reads_aligned",
"picard_summed_median",
"picard_summed_mean",
"picard_pct_percent_duplication",
"star_total_reads",
"star_uniquely_mapped",
"star_uniquely_mapped_percent",
"featurecounts_assigned")
#create format matrix
matrix_rsquared = matrix(NA, nrow = length(covariates), ncol = 20) #Number of factors
matrix_pvalue = matrix(NA, nrow = length(covariates), ncol = 20)
#lineair model function
for (x in 1:length(covariates)){
for (y in 1:20){
matrix_rsquared[x,y] <- summary( lm(pca$x[,y] ~ metadata_cultured[,covariates[x]]) )$adj.r.squared
matrix_pvalue[x,y] <- tidy( lm(pca$x[,y] ~ metadata_cultured[,covariates[x]]) )$p.value[2] #To insert pvalues in the heatmap
}
}
#fill matrix with values
matrix_rsquared <- as.data.frame(matrix_rsquared)
matrix_pvalue <- as.data.frame(matrix_pvalue)
rownames(matrix_rsquared) <- covariates
rownames(matrix_pvalue) <- covariates
#create heatmap with rsquared values
pheatmap(matrix_rsquared, main = "Correlation (Rsquared) between variables and first 20 PCs", legend = TRUE)PC16 <- pca$x[,16]
Stimulation <- metadata_cultured$stimulation
df = data.frame(Stimulation, PC16)
ggplot(data = df, mapping = aes(x = Stimulation, y = PC16)) +
geom_boxplot()PC12 <- pca$x[,12]
Stimulation <- metadata_cultured$stimulation
df = data.frame(Stimulation, PC12)
ggplot(data = df, mapping = aes(x = Stimulation, y = PC12)) +
geom_boxplot()metadata_GFM = metadata_cultured[metadata_cultured$region=="GFM",]
#check numbers
#dim(metadata_GFM)
#check numbers per stimulation
#table(metadata_cultured$region)
#remove samples in genes counts datafile
genes_counts_GFM <- genes_counts_cultured[,metadata_GFM$sample]
pca = prcomp(t(genes_counts_GFM), scale. = TRUE, center = TRUE)
autoplot(pca, colour = "stimulation", data = metadata_GFM)metadata_SVZ = metadata_cultured[metadata_cultured$region=="SVZ",]
#check numbers
#dim(metadata_SVZ)
#check numbers per stimulation
#table(metadata_cultured$region)
#remove samples in genes counts datafile
genes_counts_SVZ <- genes_counts_cultured[,metadata_SVZ$sample]
pca = prcomp(t(genes_counts_SVZ), scale. = TRUE, center = TRUE)
autoplot(pca, colour = "stimulation", data = metadata_SVZ)rv <- rowVars(genes_counts_cultured)
select <- order(rv, decreasing = TRUE)[1:1000]
pca <- prcomp(t(genes_counts_cultured[select, ]), scale. = TRUE, center = TRUE)
autoplot(pca, data= metadata_cultured, colour = 'stimulation', shape = FALSE, label.size = 2)
autoplot(pca, colour = "stimulation", data = metadata_cultured)rv <- rowVars(genes_counts_cultured)
select <- order(rv, decreasing = TRUE)[1:500]
pca <- prcomp(t(genes_counts_cultured[select, ]), scale. = TRUE, center = TRUE)
autoplot(pca, data= metadata_cultured, colour = 'stimulation', shape = FALSE, label.size = 2)
autoplot(pca, colour = "stimulation", data = metadata_cultured)